!pip install sklearn-som
!pip install minisom
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from minisom import MiniSom
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import adjusted_rand_score, silhouette_samples, silhouette_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics.cluster import pair_confusion_matrix
from sklearn_som.som import SOM
from sklearn import preprocessing
from scipy.cluster.hierarchy import dendrogram
import numpy as np
def rbf(x1, x2, gamma=None):
    '''
    RBF: radial basis function (Gaussian-style) kernel.
    ----------------------------------------------
    :param: x1: (D,) sample or (N, D) feature matrix
    :param: x2: (D,) sample or (M, D) feature matrix
    :param: gamma: kernel width, 1/(2*sigma^2); defaults to 1
    :return type: scalar, (N,) vector, or (N, M) kernel (Gram) matrix,
        depending on the dimensionality of the inputs

    NOTE: the exponent uses the *unsquared* Euclidean norm, matching the
    original implementation; callers (kkMeans) depend on this form.
    '''
    # Compare against None (not truthiness) so an explicit gamma=0 is honoured.
    if gamma is None:
        gamma = 1
    if x1.ndim == 1 and x2.ndim == 1:
        # Single pair of samples -> scalar kernel value.
        return np.exp(-gamma * np.linalg.norm(x1 - x2))
    if (x1.ndim > 1 and x2.ndim == 1) or (x1.ndim == 1 and x2.ndim > 1):
        # One sample against a batch -> vector of kernel values.
        return np.exp(-gamma * np.linalg.norm(x1 - x2, axis=1))
    # Batch vs batch -> full (N, M) Gram matrix via broadcasting.
    return np.exp(-gamma * np.linalg.norm(x1[:, np.newaxis] - x2[np.newaxis, :], axis=2))
class kkMeans:
    '''Kernel k-means clustering using the module-level `rbf` kernel.

    Samples are assigned to the cluster whose feature-space centroid is
    closest, with all distances expressed through the kernel (Gram)
    matrix, so the feature map is never materialised.
    '''

    def __init__(self, k=None, gamma=None, d=None):
        '''
        :param: k: number of clusters (default 2)
        :param: gamma: RBF kernel width (default 5)
        :param: d: retained for interface compatibility; not used by the
            RBF kernel (default 3)
        '''
        super().__init__()
        # `is None` rather than truthiness so explicit zeros are honoured.
        self.gamma = 5 if gamma is None else gamma
        self.d = 3 if d is None else d
        self.k = 2 if k is None else k

    def kernelize(self, x1, x2):
        # Gram matrix between the two sample sets under the RBF kernel.
        return rbf(x1, x2, gamma=self.gamma)

    def distance(self, kappa):
        '''Squared kernel distance of every sample to every cluster centroid.

        Uses the expansion  ||phi(x) - mu_c||^2
            = k(x, x) - 2*mean_{j in c} k(x, x_j) + mean_{i,j in c} k(x_i, x_j);
        the k(x, x) term is constant per row and dropped, since it does
        not change the argmin over clusters.

        :param: kappa: (N, N) Gram matrix of the training data
        :return type: (N, k) matrix of (shifted) squared distances
        '''
        n = kappa.shape[0]
        # BUG FIX: the previous version accumulated into the same matrix
        # with `+=` across fit iterations (it was zeroed only once in
        # fit_predict), corrupting assignments after the first iteration.
        # Rebuild it from scratch on every call instead.
        self.distance_ii = np.zeros((n, self.k))
        for c in range(self.k):
            members = self.clusters == c
            size = np.count_nonzero(members)
            if size == 0:
                # Empty cluster: make it maximally unattractive instead of
                # dividing by zero (NaNs previously broke argmin).
                self.distance_ii[:, c] = np.inf
                continue
            kappa_cc = kappa[members][:, members]
            self.distance_ii[:, c] = (np.sum(kappa_cc) / size ** 2
                                      - 2 * np.sum(kappa[:, members], axis=1) / size)
        return self.distance_ii

    def fit_predict(self, X, iteration=None, halt=None):
        '''Cluster X; returns self, with labels in `self.clusters`.

        Iterates until the fraction of points that changed cluster between
        two consecutive iterations falls below `halt`, or until the
        iteration budget is exhausted.

        :param: X: (N, D) feature matrix
        :param: iteration: maximum number of iterations (default 100)
        :param: halt: convergence tolerance on the fraction of relabelled
            points (default 1e-3)
        '''
        self.halt = 1e-3 if halt is None else halt
        self.iteration = 100 if iteration is None else iteration
        self.X = X
        N, D = X.shape
        self.kx = self.kernelize(self.X, self.X)
        # Random initial assignment of points to clusters.
        self.clusters = np.random.randint(self.k, size=N)
        for _ in range(self.iteration):
            self.distance_k = self.distance(self.kx)
            self.prev_clusters = self.clusters
            self.clusters = self.distance_k.argmin(axis=1)
            # Converged: the share of relabelled points dropped below halt.
            if 1 - float(np.sum((self.clusters - self.prev_clusters) == 0)) / N < self.halt:
                break
        return self

    def rand_index_score(self, clusters, classes):
        '''Compute the (unadjusted) Rand index.

        :param: clusters: predicted cluster labels
        :param: classes: ground-truth class labels
        :return type: float in [0, 1]
        '''
        from scipy.special import comb
        tp_fp = comb(np.bincount(clusters), 2).sum()
        tp_fn = comb(np.bincount(classes), 2).sum()
        A = np.c_[(clusters, classes)]
        tp = sum(comb(np.bincount(A[A[:, 0] == i, 1]), 2).sum()
                 for i in set(clusters))
        fp = tp_fp - tp
        fn = tp_fn - tp
        tn = comb(len(A), 2) - tp - fp - fn
        self.randindex = (tp + tn) / (tp + fp + fn + tn)
        return self.randindex
# ---------------------------------------------------------------------------
# Load the benchmark datasets. In every file the last column is the
# ground-truth cluster/class label and the remaining columns are features.
# ---------------------------------------------------------------------------
data_2d_10c = pd.read_csv("./2d-10c.dat", header=None, sep=' ')[3:]  # [3:] skips metadata rows
data_iris = pd.read_csv("./iris.csv", header=None)
data_long = pd.read_csv("./long.data", header=None, sep=' ')
data_order2_3clust = pd.read_csv("./order2-3clust.csv", header=None)
data_smile = pd.read_csv("./smile.csv", header=None)
data_square = pd.read_csv("./square.data", header=None, sep=' ')
data_heart = pd.read_csv("./heart.csv")  # NOTE: this file is read *with* a header row
scaler = preprocessing.StandardScaler()
# Split each dataset into a feature matrix X (all but the last column)
# and a label vector y (the last column).
data_2d_10c_X = data_2d_10c.to_numpy()[:, :-1]
data_2d_10c_y = data_2d_10c.to_numpy()[:, -1]
data_iris_X = data_iris.to_numpy()[:, :-1]
data_iris_y = data_iris.to_numpy()[:, -1]
data_long_X = data_long.to_numpy()[:, :-1]
data_long_y = data_long.to_numpy()[:, -1]
data_order2_3clust_X = data_order2_3clust.to_numpy()[:, :-1]
data_order2_3clust_y = data_order2_3clust.to_numpy()[:, -1]
data_smile_X = data_smile.to_numpy()[:, :-1]
data_smile_y = data_smile.to_numpy()[:, -1]
data_square_X = data_square.to_numpy()[:, :-1]
data_square_y = data_square.to_numpy()[:, -1]
data_heart_X = data_heart.to_numpy()[:, :-1]
data_heart_y = data_heart.to_numpy()[:, -1]
# Z-score-standardised copies of every feature matrix.
data_2d_10c_X_normalized = scaler.fit_transform(data_2d_10c_X)
data_iris_X_normalized = scaler.fit_transform(data_iris_X)
data_long_X_normalized = scaler.fit_transform(data_long_X)
data_order2_3clust_X_normalized = scaler.fit_transform(data_order2_3clust_X)
data_smile_X_normalized = scaler.fit_transform(data_smile_X)
data_square_X_normalized = scaler.fit_transform(data_square_X)
data_heart_X_normalized = scaler.fit_transform(data_heart_X)
# --- 2d-10c dataset: 10 clusters in 2 dimensions ---------------------------
data_X = data_2d_10c_X
data_X_normalized = data_2d_10c_X_normalized
data_y = data_2d_10c_y
n_components = 10
n_dimensions = 2
# Ground-truth scatter plot for visual reference.
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()
# Agglomerative clustering, single linkage (on standardised features).
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, complete linkage.
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, average linkage.
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, Ward linkage.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# DBSCAN on the *raw* features; eps appears hand-tuned for the unscaled space.
clustering = DBSCAN(eps=0.62).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# KMeans with the true number of clusters.
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X_normalized)
predictions = kmeans.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Kernel KMeans (custom implementation defined above).
kkmeans = kkMeans(k = n_components, gamma=0.95).fit_predict(data_X_normalized)
predictions = kkmeans.clusters
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# EM: Gaussian mixture model on the raw features.
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# 1-D SOM with one neuron per expected cluster (sklearn-som).
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
For this dataset EM produces the best result with a 99% score for the adjusted rand index. The data contains groups that aren't intertwined, so EM can easily fit a Gaussian mixture component to each group, unlike KMeans, which splits the data into Voronoi cells.
# --- Iris dataset: 3 classes in 4 dimensions --------------------------------
data_X = data_iris_X
data_X_normalized = data_iris_X_normalized
data_y = data_iris_y
n_components = 3
n_dimensions = 4
# Ground truth, projected on the first two features.
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()
# Agglomerative clustering, single linkage (raw features for this dataset).
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, complete linkage.
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, average linkage.
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, Ward linkage.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# DBSCAN with eps tuned for the raw feature scale.
clustering = DBSCAN(eps=1).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# KMeans.
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X)
predictions = kmeans.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Kernel KMeans (custom implementation defined above).
kkmeans = kkMeans(k = n_components, gamma=0.21).fit_predict(data_X)
predictions = kkmeans.clusters
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# EM: Gaussian mixture model.
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# --- Prepare iris for the MiniSom experiments below -------------------------
target = data_iris.iloc[: , -1].values
label_names = {0:'Iris Setosa', 1:'Iris Versicolour', 2:'Iris Virginica'}
data_y = data_iris[data_iris.columns[-1]]
data = data_iris[data_iris.columns[:-1]]
# Manual z-score standardisation (equivalent to StandardScaler).
data = (data - np.mean(data, axis=0)) / np.std(data, axis=0)
data = data.values
def plot_som_density(som, data):
    '''Scatter every sample on the SOM's U-matrix, jittered inside its
    winning cell and coloured by class.

    NOTE(review): relies on the module-level globals `target` and
    `label_names` rather than taking them as parameters — confirm this is
    intentional before reusing outside this notebook.
    '''
    colors = ['C0', 'C1', 'C2']
    winners = [som.winner(sample) for sample in data]
    w_x = np.array([pos[0] for pos in winners])
    w_y = np.array([pos[1] for pos in winners])
    plt.figure(figsize=(10, 9))
    # Faded distance map (U-matrix) as the background.
    plt.pcolor(som.distance_map().T, cmap='bone_r', alpha=.2)
    plt.colorbar()
    for c in np.unique(target):
        idx_target = target == c
        n_pts = np.sum(idx_target)
        # Random jitter of up to +/-0.4 cells so points in one cell don't overlap.
        jitter_x = (np.random.rand(n_pts) - .5) * .8
        jitter_y = (np.random.rand(n_pts) - .5) * .8
        plt.scatter(w_x[idx_target] + .5 + jitter_x,
                    w_y[idx_target] + .5 + jitter_y,
                    s=50, c=colors[c - 1], label=label_names[c])
    plt.legend(loc='center right')
    plt.grid()
    plt.show()
def plot_som_clusters(som, som_shape, data):
    '''Scatter the samples coloured by the flat index of their winning SOM
    node, then overlay every node's weight vector as a black cross.'''
    winner_coordinates = np.array([som.winner(x) for x in data]).T
    # Flatten the 2-D node coordinates into a single cluster index.
    cluster_index = np.ravel_multi_index(winner_coordinates, som_shape)
    for c in np.unique(cluster_index):
        members = cluster_index == c
        plt.scatter(data[members, 0], data[members, 1], alpha=.7)
    for centroid in som.get_weights():
        plt.scatter(centroid[:, 0], centroid[:, 1], marker='x',
                    s=80, linewidths=35, color='k')
def classify(som, data_X, data_y):
    '''Label each sample in data_X by majority vote of its SOM winner node.

    :param: som: a trained MiniSom instance
    :param: data_X: samples to classify
    :param: data_y: labels used to build the winner-node -> label map
    :return type: list of predicted labels, one per row of data_X
    '''
    winmap = som.labels_map(data_X, data_y)
    # Overall most common label; used for samples whose winning node was
    # never activated while building winmap.
    default_class = np.sum(list(winmap.values())).most_common()[0][0]
    result = []
    # BUG FIX: the original iterated the module-level global `data`
    # instead of the `data_X` parameter, silently coupling this helper to
    # notebook state (and breaking it for any other input).
    for d in data_X:
        win_position = som.winner(d)
        if win_position in winmap:
            result.append(winmap[win_position].most_common()[0][0])
        else:
            result.append(default_class)
    return result
# --- MiniSom on standardised iris: 3x3 grid ---------------------------------
n_neurons = 3
m_neurons = 3
som = MiniSom(n_neurons, m_neurons, data.shape[1], sigma=0.1, learning_rate=.1,
neighborhood_function='gaussian', random_seed=0)
# PCA-based weight initialisation for reproducible, faster convergence.
som.pca_weights_init(data)
som.train(data, 1000, verbose=True)
plot_som_density(som, data)
plot_som_clusters(som, (n_neurons, m_neurons), data)
# Majority-vote classification via the winner-node -> label map.
predictions = classify(som, data, data_y)
plt.figure(figsize=(8, 8))
plt.scatter(data[:, 0], data[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Cluster the SOM's distance map itself with KMeans (exploratory).
distance_map = som.distance_map().T
kmeans = KMeans(n_clusters=3, random_state=0).fit(distance_map)
predictions = kmeans.labels_
predictions
# --- MiniSom on standardised iris: larger 6x6 grid --------------------------
n_neurons = 6
m_neurons = 6
som = MiniSom(n_neurons, m_neurons, data.shape[1], sigma=0.1, learning_rate=.1,
neighborhood_function='gaussian', random_seed=0)
som.pca_weights_init(data)
som.train(data, 1000, verbose=True)
plot_som_density(som, data)
plot_som_clusters(som, (n_neurons, m_neurons), data)
kmeans = KMeans(n_clusters=3, random_state=0).fit(som.distance_map().T)
predictions = kmeans.labels_
predictions
predictions = classify(som, data, data_y)
plt.figure(figsize=(8, 8))
plt.scatter(data[:, 0], data[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
For this dataset EM produces the best result with a 90% score for the adjusted rand index.
# --- "long" dataset: 2 elongated clusters in 2 dimensions -------------------
data_X = data_long_X
data_X_normalized = data_long_X_normalized
data_y = data_long_y
n_components = 2
n_dimensions = 2
# Ground-truth scatter plot.
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()
# Agglomerative clustering, single linkage (standardised features).
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, complete linkage.
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, average linkage.
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative clustering, Ward linkage.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# DBSCAN; min_samples=1 makes every point a core point.
clustering = DBSCAN(eps=0.8, min_samples=1).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# KMeans.
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X_normalized)
predictions = kmeans.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Kernel KMeans (custom implementation defined above).
kkmeans = kkMeans(k = n_components, gamma=0.21).fit_predict(data_X_normalized)
predictions = kkmeans.clusters
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# EM: Gaussian mixture model.
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X_normalized)
predictions = gm.predict(data_X_normalized)
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# 1-D SOM with one neuron per expected cluster (sklearn-som).
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X_normalized)
predictions = som.predict(data_X_normalized)
plt.figure(figsize=(8, 8))
plt.scatter(data_X_normalized[:, 0], data_X_normalized[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
All the algorithms except Average and Complete linkage managed to obtain a 100% adjusted rand index, correctly identifying the clusters.
The dataset doesn't contain much noise or outliers. Single linkage performs well since it tends to form chains like the two clusters in our dataset. However, Complete and Average linkage fail since they're more sensitive to outliers and tend to break up large clusters. KMeans can also separate them into Voronoi cells if the data is normalized, since the axis of the decision line is reversed. The clusters can also be fit by the EM Gaussian mixture models.
# --- order2-3clust dataset: 3 clusters in 2 dimensions ----------------------
data_X = data_order2_3clust_X
data_X_normalized = data_order2_3clust_X_normalized
data_y = data_order2_3clust_y
n_components = 3
n_dimensions = 2
# Ground-truth scatter plot.
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()
# Agglomerative, single linkage.
# NOTE(review): fitted on the normalized features but plotted on the raw
# ones (and later linkages fit on raw data) — presumably intentional
# per-algorithm tuning; verify.
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, complete linkage (raw features).
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, average linkage.
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, Ward linkage.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# DBSCAN on normalized features; min_samples=2 relaxes the density requirement.
clustering = DBSCAN(eps=0.35, min_samples=2).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# KMeans on normalized features.
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X_normalized)
predictions = kmeans.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Kernel KMeans.
kkmeans = kkMeans(k = n_components, gamma=1).fit_predict(data_X_normalized)
predictions = kkmeans.clusters
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# EM: Gaussian mixture model.
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# 1-D SOM.
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
For this dataset DBSCAN produces the best result with a 75% score for the adjusted rand index.
DBSCAN has the advantage of finding arbitrarily-shaped clusters, the only condition is that the clusters shouldn't be connected. However, the points on the cluster borders might swap cluster membership, just like in our result.
# --- smile dataset: 4 non-convex clusters (face contour + features) ---------
data_X = data_smile_X
data_X_normalized = data_smile_X_normalized
data_y = data_smile_y
n_components = 4
n_dimensions = 2
# Ground-truth scatter plot.
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()
# Agglomerative, single linkage (fit on normalized, plotted on raw).
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, complete linkage.
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, average linkage.
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, Ward linkage (raw features here).
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# DBSCAN with default parameters (eps=0.5, min_samples=5).
clustering = DBSCAN().fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# KMeans.
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X)
predictions = kmeans.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Kernel KMeans.
kkmeans = kkMeans(k = n_components, gamma=3).fit_predict(data_X)
predictions = kkmeans.clusters
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# EM: Gaussian mixture model.
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# 1-D SOM.
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
For this dataset Single Linkage and DBSCAN produce the best results with a 100% score for the adjusted rand index.
Single Linkage manages to correctly assign the data to clusters, since it is able to identify the contour of the face and the mouth as they are easily chained together. The same applies to DBSCAN, which can find arbitrarily-shaped clusters even if they are completely surrounded by another cluster, as long as they are not connected.
The other algorithms like EM performed poorly, since it's hard for a Gaussian mixture model to correctly fit the contour of the face, as well as fitting the inner features. The same reasoning applies to KMeans, which won't be able to find clusters that contain one another.
# --- square dataset: 4 noisy, overlapping clusters --------------------------
data_X = data_square_X
data_X_normalized = data_square_X_normalized
data_y = data_square_y
n_components = 4
n_dimensions = 2
# Ground-truth scatter plot.
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=data_y)
plt.show()
# Agglomerative, single linkage (fit on normalized, plotted on raw).
clustering = AgglomerativeClustering(linkage='single', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, complete linkage.
clustering = AgglomerativeClustering(linkage='complete', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, average linkage.
clustering = AgglomerativeClustering(linkage='average', n_clusters=n_components).fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Agglomerative, Ward linkage (normalized features).
clustering = AgglomerativeClustering(linkage='ward', n_clusters=n_components).fit(data_X_normalized)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# DBSCAN with default parameters.
clustering = DBSCAN().fit(data_X)
predictions = clustering.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# KMeans.
kmeans = KMeans(n_clusters=n_components, random_state=0).fit(data_X)
predictions = kmeans.labels_
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# Kernel KMeans.
kkmeans = kkMeans(k = n_components, gamma=0.1).fit_predict(data_X)
predictions = kkmeans.clusters
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# EM: Gaussian mixture model.
gm = GaussianMixture(n_components=n_components, random_state=0).fit(data_X)
predictions = gm.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
# 1-D SOM.
som = SOM(m=n_components, n=1, dim=n_dimensions)
som.fit(data_X)
predictions = som.predict(data_X)
plt.figure(figsize=(8, 8))
plt.scatter(data_X[:, 0], data_X[:, 1], c=predictions)
plt.show()
print("Adjusted rand index: ", adjusted_rand_score(data_y, predictions))
For this dataset KMeans produces the best result with a 69% score for the adjusted rand index.
The Kernelized KMeans and EM algorithms also managed to get a 68% score for the adjusted rand index. The dataset contains a lot of noise which makes it hard for the algorithms to identify clusters, especially for those where we have to specify the number of clusters beforehand.
# Silhouette analysis for KMeans on iris (after the scikit-learn silhouette
# example): for each candidate cluster count, plot per-sample silhouette
# values next to the clustered scatter plot.
# NOTE: indentation restored — the notebook export had flattened it.
for n_clusters in range(2, 11):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-1, 1])
    # Leave a 10-unit gap between the silhouette bands of the clusters.
    ax1.set_ylim([0, len(data_iris_X) + (n_clusters + 1) * 10])
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = clusterer.fit_predict(data_iris_X)
    silhouette_avg = silhouette_score(data_iris_X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )
    sample_silhouette_values = silhouette_samples(data_iris_X, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        # Sorted silhouette values of the samples in cluster i.
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Vertical line marking the mean silhouette score.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        data_iris_X[:, 0],
        data_iris_X[:, 1],
        marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )
    # Mark the cluster centers with numbered white circles.
    centers = clusterer.cluster_centers_
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )
    plt.show()
# Elbow curve: per-feature-normalised inertia for each candidate k.
distorsions = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data_iris_X)
    distorsions.append(kmeans.inertia_ / data_iris_X.shape[1])
fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 11), distorsions)
plt.xticks(range(2, 11))
plt.grid(True)
plt.title('Elbow curve')
plt.show()
# Adjusted rand score against the ground truth for each candidate k.
distorsions = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data_iris_X)
    predictions = kmeans.labels_
    distorsions.append(adjusted_rand_score(data_iris_y, predictions))
fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 11), distorsions)
plt.xticks(range(2, 11))
plt.grid(True)
plt.title('Adjusted rand score')
plt.show()
The Silhouette Width score on KMeans shows that the best number of clusters should be 2. However, studying the silhouette plot, we can see that even for 3 clusters there aren't any points that were assigned to the wrong cluster. Moreover, the points aren't predominantly grouped in the first cluster like in the first plot, instead being more evenly distributed among the 3 clusters. This intuition is confirmed by the intra-class variance and the adjusted rand score plots that show a degradation starting from 3 clusters instead of 2.
# Same silhouette analysis, this time for average-linkage agglomerative
# clustering (no cluster centers to plot, hence no center overlay).
# NOTE: indentation restored — the notebook export had flattened it.
for n_clusters in range(2, 11):
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-1, 1])
    ax1.set_ylim([0, len(data_iris_X) + (n_clusters + 1) * 10])
    clusterer = AgglomerativeClustering(linkage='average', n_clusters=n_clusters).fit(data_iris_X)
    cluster_labels = clusterer.labels_
    silhouette_avg = silhouette_score(data_iris_X, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )
    sample_silhouette_values = silhouette_samples(data_iris_X, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        data_iris_X[:, 0],
        data_iris_X[:, 1],
        marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )
    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")
    plt.suptitle(
        "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
        % n_clusters,
        fontsize=14,
        fontweight="bold",
    )
    plt.show()
# Adjusted rand score of average linkage for each candidate cluster count.
distorsions = []
for k in range(2, 11):
    averageLinkage = AgglomerativeClustering(linkage='average', n_clusters=k)
    averageLinkage.fit(data_iris_X)
    predictions = averageLinkage.labels_
    distorsions.append(adjusted_rand_score(data_iris_y, predictions))
fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 11), distorsions)
plt.xticks(range(2, 11))
plt.grid(True)
plt.title('Adjusted rand index')
plt.show()
def plot_dendrogram(model, **kwargs):
    '''Draw a scipy dendrogram from a fitted AgglomerativeClustering model.

    The model must have been fitted with compute_distances=True so that
    `model.distances_` is populated. Extra kwargs are forwarded to
    scipy.cluster.hierarchy.dendrogram.
    '''
    n_samples = len(model.labels_)
    counts = np.zeros(model.children_.shape[0])
    # Size of each merged cluster: a leaf child counts as one observation,
    # an internal child contributes the size recorded at its own merge.
    for i, merge in enumerate(model.children_):
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )
    # Assemble the (n-1, 4) linkage matrix scipy expects:
    # [child_a, child_b, merge_distance, cluster_size].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    dendrogram(linkage_matrix, **kwargs)
# Fit average linkage over the full merge tree (compute_distances=True is
# required by plot_dendrogram) and draw the dendrogram with a candidate
# cut line at distance 3.
averageLinkage = AgglomerativeClustering(linkage='average', compute_distances=True)
averageLinkage = averageLinkage.fit(data_iris_X)
plt.figure(figsize=(8, 8))
# Typo fixed ("Dendogram" -> "Dendrogram"); the f-string prefix was also
# pointless since the title had no placeholders.
plt.title("Dendrogram")
plot_dendrogram(averageLinkage)
plt.axhline(y=3, color="black", linestyle="--")
plt.show()
For hierarchical clustering with average linkage, the silhouette score shows again that the number of clusters should be 2. Again, further inspecting the silhouette plot, we can see that for 3 clusters we have fewer points assigned to the wrong cluster, as well as them being more evenly distributed among clusters. The adjusted rand score also shows that the number of clusters should be 3.
The Average Linkage dendrogram shows that there's a significant distance between the green and the red clusters. We could conclude that the number of clusters should be 2, since if we were to cut the dendrogram further down the distances aren't that significant. However, if we were to correlate the dendrogram to the scatter plot, we would see that the red cluster is formed by two other ones that are really close together. Thus, a better cutting point would be 1.8, which would result in 3 clusters.
# Cluster the heart dataset with KMeans for a range of k, showing the
# confusion matrix against the true labels and collecting the
# per-feature-normalised inertia for an elbow plot.
distorsions = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k).fit(data_heart_X_normalized)
    predictions = kmeans.labels_
    # Renamed from `cm`: the original shadowed the `matplotlib.cm` module
    # imported at the top of the file.
    conf_mat = confusion_matrix(data_heart_y, predictions)
    cmd = ConfusionMatrixDisplay(conf_mat)
    cmd.plot()
    # BUG FIX: the original scored against `data_y`, which at this point
    # still held the *square* dataset's labels, not the heart labels.
    print(f"Adjusted rand index {k} clusters: ", adjusted_rand_score(data_heart_y, predictions))
    # BUG FIX: normalise inertia by the heart feature count, not iris's.
    distorsions.append(kmeans.inertia_ / data_heart_X_normalized.shape[1])
fig = plt.figure(figsize=(15, 5))
plt.plot(range(2, 11), distorsions)
plt.xticks(range(2, 11))
plt.grid(True)
plt.title('Elbow curve')
plt.show()